In [1]:
#imports libaries


import pandas as pd
import numpy as np
import os

import datetime as dt

import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
#get current working directory
cwd = os.getcwd()
#create a dataframe from csv file

df = pd.read_csv("airways customer data.csv")
In [3]:
df.head()
Out[3]:
Unnamed: 0 reviews rates date country
0 0 ✅ Trip Verified | Excellent service both on th... \n\t\t\t\t\t\t\t\t\t\t\t\t\t5 24th January 2023 United Kingdom
1 1 ✅ Trip Verified | Good lounge at Cape Town. O... 9 24th January 2023 United Kingdom
2 2 ✅ Trip Verified | A really excellent journey.... 9 23rd January 2023 United Kingdom
3 3 ✅ Trip Verified | This flight was one of the ... 9 21st January 2023 United Kingdom
4 4 Not Verified | It seems that there is a race t... 2 19th January 2023 United States
In [4]:
df['verified'] = df.reviews.str.contains("Trip Verified")
In [5]:
df['verified']
Out[5]:
0        True
1        True
2        True
3        True
4       False
        ...  
3413    False
3414    False
3415    False
3416    False
3417    False
Name: verified, Length: 3418, dtype: bool
In [6]:
df.shape
Out[6]:
(3418, 6)
In [7]:
df.columns
Out[7]:
Index(['Unnamed: 0', 'reviews', 'rates', 'date', 'country', 'verified'], dtype='object')
In [8]:
df.describe()
Out[8]:
Unnamed: 0
count 3418.000000
mean 1708.500000
std 986.835937
min 0.000000
25% 854.250000
50% 1708.500000
75% 2562.750000
max 3417.000000
In [9]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3418 entries, 0 to 3417
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  3418 non-null   int64 
 1   reviews     3418 non-null   object
 2   rates       3418 non-null   object
 3   date        3418 non-null   object
 4   country     3416 non-null   object
 5   verified    3418 non-null   bool  
dtypes: bool(1), int64(1), object(4)
memory usage: 137.0+ KB
In [10]:
#Cleaning/Format date
df.dtypes
Out[10]:
Unnamed: 0     int64
reviews       object
rates         object
date          object
country       object
verified        bool
dtype: object
In [11]:
df.date = pd.to_datetime(df.date)
In [12]:
df.date.head()
Out[12]:
0   2023-01-24
1   2023-01-24
2   2023-01-23
3   2023-01-21
4   2023-01-19
Name: date, dtype: datetime64[ns]
In [13]:
#Cleaning ratings with stars
#check for unique values
df.rates.unique()
Out[13]:
array(['\n\t\t\t\t\t\t\t\t\t\t\t\t\t5', '9', '2', '3', '1', '4', '5', '8',
       '6', '7', '10', 'None'], dtype=object)
In [14]:
# remove the \t and \n from the ratings
df.rates = df.rates.str.strip("\n\t\t\t\t\t\t\t\t\t\t\t\t\t")
In [15]:
df.rates.value_counts()
Out[15]:
1       745
2       385
3       380
8       344
10      303
7       297
9       293
5       255
4       232
6       179
None      5
Name: rates, dtype: int64
In [16]:
#There are 5 rows having values "None" in the ratings. We will drop all these 5 rows.
# drop the rows where the value of ratings is None
df.drop(df[df.rates == "None"].index, axis=0, inplace=True)
In [17]:
#check the unique values again
df.rates.unique()
Out[17]:
array(['5', '9', '2', '3', '1', '4', '8', '6', '7', '10'], dtype=object)
In [18]:
#Check for null Values
df.isnull().value_counts()
Out[18]:
Unnamed: 0  reviews  rates  date   country  verified
False       False    False  False  False    False       3411
                                   True     False          2
dtype: int64
In [19]:
df.country.isnull().value_counts()
Out[19]:
False    3411
True        2
Name: country, dtype: int64
In [20]:
#We have two missing values for country. For this we can just remove those two reviews (rows) from the dataframe
#drop the rows using index where the country value is null
df.drop(df[df.country.isnull() == True].index, axis=0, inplace=True)
In [21]:
df.shape
Out[21]:
(3411, 6)
In [22]:
df.isnull().value_counts()
Out[22]:
Unnamed: 0  reviews  rates  date   country  verified
False       False    False  False  False    False       3411
dtype: int64
In [23]:
#resetting the index
df.reset_index(drop=True)
Out[23]:
Unnamed: 0 reviews rates date country verified
0 0 ✅ Trip Verified | Excellent service both on th... 5 2023-01-24 United Kingdom True
1 1 ✅ Trip Verified | Good lounge at Cape Town. O... 9 2023-01-24 United Kingdom True
2 2 ✅ Trip Verified | A really excellent journey.... 9 2023-01-23 United Kingdom True
3 3 ✅ Trip Verified | This flight was one of the ... 9 2023-01-21 United Kingdom True
4 4 Not Verified | It seems that there is a race t... 2 2023-01-19 United States False
... ... ... ... ... ... ...
3406 3413 Flew British Airways from Sydney to Venice on ... 6 2014-05-20 Australia False
3407 3414 Recently for our honeymoon we took advantage o... 10 2014-05-19 United Kingdom False
3408 3415 LGW-Antigua on 29 April and Antigua-LGW on 13 ... 1 2014-05-18 United Kingdom False
3409 3416 LHR-JTR-LHR. A tale of opposites for BA's new ... 4 2014-05-18 United Kingdom False
3410 3417 Gatwick-Venice return. Two contrasting experie... 2 2014-05-18 United Kingdom False

3411 rows × 6 columns

In [24]:
#What is the average overall rating given for British Airways?
df.rates.mean()
Out[24]:
inf
In [25]:
df.rates.value_counts().plot(kind="bar")
plt.xlabel("Ratings")
plt.ylabel("Total Number of reviews with that rating")
plt.suptitle("Counts for each ratings")
Out[25]:
Text(0.5, 0.98, 'Counts for each ratings')
In [26]:
df_ratings = pd.DataFrame(df.rates.value_counts())
pct_values = (df_ratings.rates.values/ df_ratings.rates.values.sum() *100).tolist()
pct_values = [round(x,2) for x in pct_values]
df_ratings['pct_values'] = pct_values
In [27]:
# renaming columns
df_ratings.rename(columns={'index':'rates', 'rates':'total_counts'}, inplace=True)
In [28]:
df_ratings
Out[28]:
total_counts pct_values
1 745 21.84
2 385 11.29
3 380 11.14
8 343 10.06
10 302 8.85
7 297 8.71
9 293 8.59
5 255 7.48
4 232 6.80
6 179 5.25
In [29]:
# Unique countries BA recieved the reviews from

print(f"{len(df.country.unique())} unique countries")
69 unique countries
In [30]:
#Which country most review comes from?
df_country_review = pd.DataFrame(df.country.value_counts().head()).reset_index()
In [31]:
df_country_review.rename(columns={'index':'country','country':'total_reviews'}, inplace=True)
In [32]:
df_country_review.plot(kind="bar", x='country')
plt.title("Maximum number of review by country")
Out[32]:
Text(0.5, 1.0, 'Maximum number of review by country')
In [33]:
#convert the date datatype to datetime

df.date = pd.to_datetime(df.date)
In [34]:
fig = px.line(df, x='date', y="rates")
fig.update_xaxes(rangeslider_visible=True)
fig.show()
In [ ]: